"""
大众点评标题搜索
"""

from lxml import etree
import requests
import sys
import time
import random


# Shared request headers for all Dianping requests.
# NOTE(review): the Cookie value contains a session token (`dper`) and will
# expire — requests will start failing / redirecting once it does. The Host
# and Referer headers mimic a normal browser visit to avoid basic blocking.
_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
    'Referer': 'https://www.dianping.com',
    'Host': 'www.dianping.com',
    'Cookie': '_lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=164d15ff978c8-0cbe23bbf84576-2711938-1fa400-164d15ff979c8; _lxsdk=164d15ff978c8-0cbe23bbf84576-2711938-1fa400-164d15ff979c8; _hc.v=fae0f77d-4696-8f16-1a2b-4514cd156e5d.1532521085; s_ViewType=10; dper=73fa6c226f7cd7ed86b79623b023220565efe9676372e43d101d3b84ef2a6913fb4f1afdb36ecccf8ec1eb9208443c739d288d0337b5dc1815bb59fe98c00f1feb7cc254527f2d184594c6eb4947f013872006854f39c5100ceb30e5f9946f17; ll=7fd06e815b796be3df069dec7836c3df; ua=%E4%BD%95%E4%BA%AE_6099; ctu=ab36297f80c658951bfa28a9c104a994175f5ec135840a10ea4f66c9fdc84086; uamo=18817330810; cy=1; cye=shanghai; _lxsdk_s=164d15ff979-551-c75-c83%7C%7C439'
}


def main_search_url(city, kw):
    """Build the search landing-page URL for a keyword in a given city.

    Args:
        city: City slug; currently only 'beijing' and 'shanghai' are mapped
              to Dianping's numeric city codes.
        kw:   Search keyword (appended to the URL as-is).

    Returns:
        The search root URL, e.g.
        'https://www.dianping.com/search/keyword/1/0_面包'.

    Raises:
        ValueError: if *city* is not a supported city slug (previously this
        surfaced as an opaque KeyError).
    """
    city_code = {'beijing': '0', 'shanghai': '1'}
    if city not in city_code:
        raise ValueError(f'unsupported city: {city!r} (expected one of {sorted(city_code)})')
    return f'https://www.dianping.com/search/keyword/{city_code[city]}/0_{kw}'


def max_pages(city, kw, timeout=10):
    """Return the number of result pages for a keyword search.

    Fetches the search landing page and extracts the pagination links.

    Args:
        city:    City slug accepted by :func:`main_search_url`.
        kw:      Search keyword.
        timeout: Seconds before the HTTP request is aborted (new, defaulted
                 parameter — previously the request could hang forever).

    Returns:
        Highest page number found in the pagination bar, or 0 when no
        pagination is present (single page of results, or markup changed).

    Side effects:
        Exits the process (``sys.exit(1)``) when the page cannot be fetched —
        preserved from the original behavior since callers rely on a valid int.
    """
    root_url = main_search_url(city, kw)
    try:
        # Fix: bound the request; requests.get without timeout blocks forever.
        r = requests.get(root_url, headers=_headers, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except Exception as e:
        print('[ERR0] 页数获取失败:', e)
        sys.exit(1)

    tree = etree.HTML(r.text)
    if tree is None:
        # Fix: etree.HTML returns None for empty/unparseable bodies; the old
        # code would raise AttributeError on tree.xpath here.
        return 0
    # NOTE(review): positional XPath is brittle — verify against the live
    # markup if results look wrong.
    pages = tree.xpath('//div[3]/div[1]/div[2]/a/text()')
    numeric = [int(p) for p in pages if p.isnumeric()]
    return max(numeric) if numeric else 0


def single_page_titles(city, kw, page, timeout=10):
    """Return all shop titles from one search-result page.

    Args:
        city:    City slug accepted by :func:`main_search_url`.
        kw:      Search keyword.
        page:    1-based result-page number (appended as ``/p<page>``).
        timeout: Seconds before the HTTP request is aborted (new, defaulted
                 parameter — previously the request could hang forever).

    Returns:
        List of title strings extracted via ``//a/h4/text()``; an empty list
        on any fetch/parse failure (best-effort, failure is logged to stdout).
    """
    cur_url = f'{main_search_url(city, kw)}/p{page}'

    try:
        # Fix: bound the request; requests.get without timeout blocks forever.
        r = requests.get(cur_url, headers=_headers, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        tree = etree.HTML(r.text)
        return tree.xpath('//a/h4/text()')
    except Exception as e:
        # Deliberate best-effort: one bad page must not abort the whole crawl.
        print('[ERR1] 页面打开失败:', cur_url, '原因:', e)
        return []


def title_collection(city, kw, time_pause=1):
    """Collect shop titles for a keyword search, page by page (single process).

    Args:
        city:       City slug accepted by :func:`main_search_url`.
        kw:         Search keyword.
        time_pause: Base delay in seconds between page fetches; a random
                    0-1s jitter is added to look less bot-like.

    Returns:
        Flat list of all titles found (may contain duplicates across pages).
    """
    total = max_pages(city, kw)

    # No pagination bar detected: just scrape the single first page.
    if not total:
        return single_page_titles(city, kw, 1)

    collected = []
    for page_no in range(1, total + 1):
        collected += single_page_titles(city, kw, page_no)
        print(f'{page_no} / {total} 采集完毕...')
        time.sleep(time_pause + random.random())
    return collected


if __name__ == '__main__':
    # Demo run: crawl titles for one keyword, dedupe, and dump to a text file.
    city, kw = 'shanghai', '面包'
    fn = '文件名.txt'

    # set() removes duplicate titles collected across pages (order not kept).
    unique_titles = set(title_collection(city, kw, time_pause=0))

    with open(fn, 'w', encoding='utf-8') as out:
        out.write('\n'.join(unique_titles))
